
library(CongressData)
library(cspp)

library(tidyverse) 
library(here)

#==============================================================================#
# Figure 1: CSPP Cumulative Plot
#==============================================================================#

# For every column of the correlates data except `year`, record the earliest
# year in which that column has a non-missing value (one row per variable).
first_years <- csppData::correlates %>%
  summarise(across(-year, ~ min(year[!is.na(.x)]))) %>%
  pivot_longer(
    everything(),
    names_to  = "variable",
    values_to = "first_year"
  )

# Number of variables whose first non-missing year is each year, plus the
# running total. count() returns rows ordered by first_year, so cumsum()
# accumulates chronologically.
variable_counts <- first_years %>%
  count(first_year, name = "variable_count") %>%
  mutate(cumulative_variables = cumsum(variable_count))

# Line chart of the running total against year.
ggplot(variable_counts, aes(first_year, cumulative_variables)) +
  geom_line(color = "#2C3E50", size = 1, lineend = "round") +
  scale_x_continuous(
    limits = c(1900, 2020),
    breaks = c(1900, 1925, 1950, 1975, 2000, 2020)
  ) +
  labs(
    title = "Figure 1. Correlates: Cumulative Number of Available Variables",
    x = "Year",
    y = "Number of Variables"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Fira Sans"),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(hjust = 1),
    panel.border = element_rect(color = "#2C3E50", fill = NA, size = .5),
    # panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

#==============================================================================#
# Figure 2: CONGRESS Cumulative Plot
#==============================================================================#

# Fetch the CongressData panel via the package's loader.
cong <- CongressData::get_cong_data()

# For every column except `year`, find the earliest year with a non-missing
# value, then count how many variables start in each year and accumulate.
# count() returns rows ordered by first_year, so cumsum() runs chronologically.
variable_counts <- cong %>%
  summarise(across(-year, ~ min(year[!is.na(.x)]))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "first_year"
  ) %>%
  count(first_year, name = "variable_count") %>%
  mutate(cumulative_variables = cumsum(variable_count))

m <- max(variable_counts$cumulative_variables)

# carry max on through final years
# The padding length is derived from the data instead of a hard-coded 13, so
# the script keeps working when the data's final year is not 2024. If the data
# end before 2012, no padding rows are added.
final_year  <- max(cong$year)
carry_years <- if (final_year >= 2012) 2012:final_year else integer(0)

variable_counts <- variable_counts %>%
  add_row(
    first_year = carry_years,
    variable_count = rep(0, length(carry_years)),
    cumulative_variables = rep(m, length(carry_years))
  )

ggplot(variable_counts, aes(x = first_year, y = cumulative_variables)) +
  geom_line(color = "#2C3E50", size = 1, lineend = "round") +
  scale_x_continuous(
    limits = c(1785, 2024),
    breaks = c(1789, 1925, 1950, 1975, 2000, 2024)
  ) +
  scale_y_continuous(limits = c(0, 1100), breaks = c(0, 250, 500, 750, 1000)) +
  theme_minimal() +
  theme(
    text = element_text(family = "Fira Sans"),
    axis.text.x = element_text(hjust = 1),
    panel.grid.major = element_blank(),
    panel.border = element_rect(color = "#2C3E50", fill = NA, size = .5),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Figure 2. CongressData: Cumulative Number of Available Variables",
    x = "Year",
    y = "Number of Variables"
  )

#==============================================================================#
# Figure 3: CSPP Lollipop
#==============================================================================#

# Current CSPP data (not referenced again in this section) and the June 2021
# snapshot, whose column names define the "Original" variable set.
cspp      <- cspp::get_cspp_data()
cspp_old  <- read_csv(here("data","cspp_june_2021.csv"))

# Variable metadata, with the fine-grained categories collapsed into broader
# groups. Anything not listed below falls into "Other".
cspp_cats <- cspp::get_var_info() %>%
  mutate(
    category = str_to_title(category),
    category = case_when(
      category %in% c("Criminal Justice", "Drug-Alcohol", "Gun Control", "Rights") ~ "Social Issues",
      category %in% c("Economic-Fiscal", "Labor", "Welfare", "Transportation") ~ "Economic & Labor",
      category %in% c("Government", "Elections") ~ "Governance",
      category %in% c("Policy-Ideology", "Misc. Regulation") ~ "Policy & Ideology",
      category %in% c("Healthcare", "Demographics", "Education") ~ "Public Health",
      category == "Environment" ~ "Environment",
      TRUE ~ "Other"
    )
  )

# Variables per category: all variables in the metadata ("Update") versus
# only those that also appear as columns in the old snapshot ("Original").
cspp_cats_new <- cspp_cats %>%
  count(category, name = "n") %>%
  mutate(v = "Update")

cspp_cats_old <- cspp_cats %>%
  filter(variable %in% colnames(cspp_old)) %>%
  count(category, name = "n") %>%
  mutate(v = "Original")

cspp_cats <- bind_rows(cspp_cats_new, cspp_cats_old)

# One row per category with the span between its smallest and largest count;
# drawn as the grey connector between the two points.
segment_data <- cspp_cats %>%
  group_by(category) %>%
  summarise(
    y_min = min(n, na.rm = TRUE),
    y_max = max(n, na.rm = TRUE),
    .groups = "drop"
  )

# Order categories by count (FALSE spelled out; `F` can be reassigned).
cspp_cats <- cspp_cats %>%
  mutate(category = fct_reorder(category, n, .desc = FALSE))

segment_data <- segment_data %>%
  mutate(category = fct_reorder(category, y_max, .desc = FALSE))

# Plot
ggplot(cspp_cats, aes(x = category, y = n, color = v)) +
  geom_segment(
    data = segment_data,
    aes(
      x = category,
      xend = category,
      y = y_min,
      yend = y_max
    ),
    inherit.aes = FALSE,
    color = "gray"
  ) +
  geom_point(size = 3) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Figure 3. Correlates: Variables by Category",
    y = "Number of Variables",
    color = NULL
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 800),
    breaks = seq(0, 800, 100),
    labels = seq(0, 800, 100)
  ) +
  theme(
    text = element_text(family = "Fira Sans"),
    axis.text.x = element_text(hjust = 1),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_rect(color = "#2C3E50", fill = NA, size = .5),
    plot.title = element_text(hjust = 0.5),
    legend.position = "top",
    legend.title = element_blank()
  )

#==============================================================================#
# Figure 4: CONGRESS Lollipop
#==============================================================================#

# Variable metadata for CongressData, reduced to a per-category count.
# Missing categories are folded into "All Other Variables", and the long
# category names get an embedded line break so the axis labels wrap.
# NOTE(review): assumes `category` is a character column - confirm.
cong_cats <- CongressData::get_var_info() %>%
  mutate(category = case_when(
    is.na(category) | category == "All Other Variables" ~ "All Other\nVariables",
    category == "District Characteristics" ~ "District\nCharacteristics",
    category == "Member Characteristics" ~ "Member\nCharacteristics",
    category == "Congressional Bills" ~ "Congressional\nBills",
    TRUE ~ category
  )) %>%
  count(category, name = "n")

# Horizontal dot chart with a segment to each dot, one row per category.
ggpubr::ggdotchart(
  cong_cats,
  x = "category",
  y = "n",
  add = "segment",
  color = "#825fc4",
  dot.size = 3,
  sorting = "descending",
  rotate = TRUE,
  y.text.col = TRUE,
  ggtheme = theme_bw()
) +
  theme_minimal() +
  labs(
    title = "Figure 4. CongressData: Variables by Category",
    y = "Number of Variables"
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 450),
    breaks = seq(0, 450, 75),
    labels = seq(0, 450, 75)
  ) +
  theme(
    text = element_text(family = "Fira Sans"),
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(hjust = 1),
    axis.title.y = element_blank(),
    panel.border = element_rect(color = "#2C3E50", fill = NA, size = .5),
    panel.grid.major = element_blank()
  )

#==============================================================================#
# VALIDATION: BIOGUIDE IDS
#==============================================================================#

# load senator bioguide ids
# load() returns the NAMES of the objects it restores, not their contents, so
# the restored object(s) are fetched by name and flattened to a character
# vector of ids before being used as a filter.
# NOTE(review): assumes the .rda holds the ids as a vector (or as data-frame
# columns) - confirm against the file.
sen_loaded <- load(here("data","sen_bioguides.rda"))
sen <- mget(sen_loaded, envir = environment())
sen <- unique(str_trim(as.character(unlist(sen, use.names = FALSE))))

# Reference list of member ids: rows with any missing field are dropped, ids
# are normalised to trimmed character, and senators are excluded.
bio <- read_csv(here("data","bioguides.csv")) %>%
  drop_na() %>%
  mutate(memID = str_trim(as.character(`Member ID`))) %>%
  filter(!memID %in% sen)

cong <- CongressData::get_cong_data() %>%
  mutate(bioguide = str_trim(as.character(bioguide)))

# Reference ids that never appear in CongressData.
ids <- setdiff(unique(bio$memID), unique(cong$bioguide))

# Share of distinct reference ids that are unmatched. The denominator counts
# distinct ids so that it is on the same footing as the numerator.
length(ids) / n_distinct(bio$memID)

#==============================================================================#
# VALIDATION: NAMES
#==============================================================================#

# Reference data with names stored in `thomas_name` as "Last, First".
# The name is split at the FIRST comma only: sub() replaces a single match,
# whereas gsub() with the lazy pattern ".*?," kept matching and stripped
# everything through the last comma (so "Last, First, Jr." gave "Jr.").
# Accents are transliterated to ASCII so both sources compare like for like.
le <- readxl::read_xlsx(here("data","LEPData93to110Congresses.xlsx")) %>%
  mutate(
    lastname = trimws(sub(",.*", "", thomas_name)),
    firstname = trimws(sub("^[^,]*,", "", thomas_name)),
    fullname = paste(firstname, lastname, sep = " ")
  ) %>%
  mutate(fullname = stringi::stri_trans_general(fullname, "Latin-ASCII"))

# Same "First Last" key built from CongressData's firstname/lastname columns.
cong <- cong %>%
  mutate(fullname = paste(firstname, lastname, sep = " ")) %>%
  mutate(fullname = stringi::stri_trans_general(fullname, "Latin-ASCII"))

# Reference names with no exact match in CongressData, and their share of
# all distinct reference names.
diff <- unique(setdiff(le$fullname, cong$fullname))

length(diff) / length(unique(le$fullname))


